# Load the GAA-EVAL evaluation toolkit (also attaches its package dependencies,
# which emit the build-version warnings below).
source("../GAA-EVAL.R")
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'pROC' was built under R version 3.4.4
## Warning: package 'reshape2' was built under R version 3.4.3
## Warning: package 'ggrepel' was built under R version 3.4.4
## Warning: package 'gmodels' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
# Read the experimental measurements: ID in column 2, measured value in
# column 5, standard deviation in column 6.
exp.data <- read.RealData(file = "exp_data.csv", sep = ",",
col.id = 2, col.value = 5, col.sd = 6)
# inspect the experimental value
head(exp.data$value)
# inspect the experimental sd
head(exp.data$sd)
# Read every submission file under prediction/, aligned against the
# experimental data by ID (column 1); value in column 2, sd in column 3.
sub.data <- read.Submission.Folder(folder.name = "prediction/",col.id = 1,
col.value = 2, col.sd = 3, real.data = exp.data)
head(sub.data$value)
# inspect the dimension
dim(sub.data$value)
Apply the z-score transformation to the scatterplots to unify the scale of the predicted values.
# Scatterplots of predicted vs experimental values for every submission;
# z.transform = TRUE puts all predictions on a common z-score scale.
# The warnings below report points dropped for missing values.
plot_all_scatter(real.data = exp.data, pred.data = sub.data, z.transform = TRUE)
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing missing values (geom_point).
Without bootstrap. method = "pearson": in this example we use the Pearson correlation. sd.use = 0.3: experimental values with an sd larger than 0.3 are filtered out. Note that the z-transformation has no effect on the Pearson correlation.
# Step 1: compute the Pearson correlation coefficient for every submission.
# sd.use = 0.3 drops experimental values whose sd exceeds 0.3; the z-score
# transform is requested but does not change a Pearson coefficient.
result.cor.pearson <- eval.Correlation(
  real.data = exp.data,
  pred.data = sub.data,
  method = "pearson",
  sd.use = 0.3,
  z.transform = TRUE
)
head(result.cor.pearson)
## pearson.coefficient.n=355.sd<0.3
## Group_1-prediction_file_nocomments-1 0.3108669
## Group_1-prediction_file_nocomments-2 0.2509373
## Group_1-prediction_file_nocomments-3 0.2175485
## Group_2-prediction_file_nocomments-1 0.2723800
## Group_2-prediction_file_nocomments-2 0.2800299
## Group_3-prediction_file_nocomments-1 0.3149842
## p.value
## Group_1-prediction_file_nocomments-1 2.157713e-09
## Group_1-prediction_file_nocomments-2 1.683052e-06
## Group_1-prediction_file_nocomments-3 3.563631e-05
## Group_2-prediction_file_nocomments-1 1.859902e-07
## Group_2-prediction_file_nocomments-2 8.082396e-08
## Group_3-prediction_file_nocomments-1 1.287089e-09
# Step 2: visualise the per-submission coefficients.
plot.Correlation(result.cor.pearson, "Pearson")
Bootstrap using row resampling with replacement (rep.time = 500 is a fixed input for now): boot = TRUE, boot.var = FALSE.
# 1. Render coefficient value
# Bootstrap by resampling rows with replacement (boot = TRUE; rep.time is
# fixed at 500 inside eval.Correlation for now). Use the full TRUE literal:
# T is a reassignable alias and should not be relied on.
boot.result.cor.pearson <- eval.Correlation(real.data = exp.data, pred.data = sub.data,
                                            method = "pearson", sd.use = 0.3,
                                            z.transform = TRUE, boot = TRUE)
# For Correlation-based Evaluation, provide mean, CI, and median pval
head(boot.result.cor.pearson)
## avg low_ci high_ci
## Group_1-prediction_file_nocomments-1 0.3066170 0.2275263 0.3821523
## Group_1-prediction_file_nocomments-2 0.2465266 0.1653698 0.3276328
## Group_1-prediction_file_nocomments-3 0.2148069 0.1269072 0.2951286
## Group_2-prediction_file_nocomments-1 0.2701282 0.1902947 0.3481628
## Group_2-prediction_file_nocomments-2 0.2771144 0.1954804 0.3503623
## Group_3-prediction_file_nocomments-1 0.3118958 0.2242984 0.3884650
## sd p.value
## Group_1-prediction_file_nocomments-1 0.04901241 3.652800e-09
## Group_1-prediction_file_nocomments-2 0.05023731 2.990593e-06
## Group_1-prediction_file_nocomments-3 0.05269642 3.926904e-05
## Group_2-prediction_file_nocomments-1 0.04706952 2.759994e-07
## Group_2-prediction_file_nocomments-2 0.04686972 1.029971e-07
## Group_3-prediction_file_nocomments-1 0.05113794 1.483798e-09
# 2. Plot Correlation (boot = TRUE adds the bootstrap error bars)
plot.Correlation(boot.result.cor.pearson, "Pearson", boot = TRUE)
Bootstrap using distribution modelling (rep.time = 500 is a fixed input for now): boot = TRUE, boot.var = TRUE.
# Bootstrap by modelling the experimental distribution (boot.var = TRUE)
# instead of resampling rows. TRUE is spelled out: T/F are reassignable
# aliases and unsafe in scripts.
bootvar.result.cor.pearson <- eval.Correlation(real.data = exp.data, pred.data = sub.data,
                                               method = "pearson", sd.use = 0.3,
                                               z.transform = TRUE, boot = TRUE,
                                               boot.var = TRUE)
head(bootvar.result.cor.pearson)
## avg low_ci high_ci
## Group_1-prediction_file_nocomments-1 0.3098081 0.3040298 0.3159479
## Group_1-prediction_file_nocomments-2 0.2500853 0.2443017 0.2562133
## Group_1-prediction_file_nocomments-3 0.2169032 0.2112245 0.2229135
## Group_2-prediction_file_nocomments-1 0.2714877 0.2648111 0.2774905
## Group_2-prediction_file_nocomments-2 0.2790994 0.2730558 0.2852922
## Group_3-prediction_file_nocomments-1 0.3140110 0.3076190 0.3210243
## sd p.value
## Group_1-prediction_file_nocomments-1 0.003568889 2.426110e-09
## Group_1-prediction_file_nocomments-2 0.003564544 1.828919e-06
## Group_1-prediction_file_nocomments-3 0.003507820 3.785398e-05
## Group_2-prediction_file_nocomments-1 0.003668964 2.067416e-07
## Group_2-prediction_file_nocomments-2 0.003569346 8.890819e-08
## Group_3-prediction_file_nocomments-1 0.004036481 1.443387e-09
# Visualise with bootstrap error bars.
plot.Correlation(bootvar.result.cor.pearson, "Pearson", boot = TRUE)
- (for demonstration, only the bootstrapped results are presented)
- variance.normalization: defined in the accompanying slides (PPT)
- density.distance: RMSD based on a distribution model of the experimental values
# RMSD evaluation: no variance normalisation, no density distance.
# sd.use = NA keeps every experimental value (no sd filtering).
boot.result.rmsd4 <- eval.RMSD(
  real.data = exp.data,
  pred.data = sub.data,
  sd.use = NA,
  density.distance = FALSE,
  variance.normalization = FALSE,
  boot = TRUE
)
head(boot.result.rmsd4)
## RMSD low_ci high_ci
## Group_1-prediction_file_nocomments-1 0.4010775 0.3805267 0.4221817
## Group_1-prediction_file_nocomments-2 0.4104334 0.3877751 0.4325619
## Group_1-prediction_file_nocomments-3 0.4287069 0.4054087 0.4517231
## Group_2-prediction_file_nocomments-1 0.4410671 0.4190720 0.4640579
## Group_2-prediction_file_nocomments-2 0.4243437 0.4034699 0.4462245
## Group_3-prediction_file_nocomments-1 0.4631417 0.4380073 0.4889377
## sd
## Group_1-prediction_file_nocomments-1 0.01360405
## Group_1-prediction_file_nocomments-2 0.01414572
## Group_1-prediction_file_nocomments-3 0.01421772
## Group_2-prediction_file_nocomments-1 0.01392664
## Group_2-prediction_file_nocomments-2 0.01319777
## Group_3-prediction_file_nocomments-1 0.01537482
plot.RMSD(boot.result.rmsd4, method = "", boot = TRUE)
# RMSD evaluation: with variance normalisation, without density distance.
boot.result.rmsd2 <- eval.RMSD(
  real.data = exp.data,
  pred.data = sub.data,
  sd.use = NA,
  density.distance = FALSE,
  variance.normalization = TRUE,
  boot = TRUE
)
head(boot.result.rmsd2)
## RMSD low_ci high_ci sd
## Group_1-prediction_file_nocomments-1 2.252916 2.056278 2.456234 0.1230616
## Group_1-prediction_file_nocomments-2 2.290883 2.098477 2.482414 0.1207208
## Group_1-prediction_file_nocomments-3 2.432993 2.226952 2.654717 0.1333272
## Group_2-prediction_file_nocomments-1 2.363973 2.161389 2.578943 0.1275932
## Group_2-prediction_file_nocomments-2 2.356854 2.158908 2.571599 0.1255129
## Group_3-prediction_file_nocomments-1 1.872031 1.716226 2.034078 0.1000889
plot.RMSD(boot.result.rmsd2, method = "", boot = TRUE)
# RMSD evaluation: with variance normalisation AND the density distance
# (RMSD computed against the distribution model of the experimental values).
boot.result.rmsd <- eval.RMSD(
  real.data = exp.data,
  pred.data = sub.data,
  sd.use = NA,
  density.distance = TRUE,
  variance.normalization = TRUE,
  boot = TRUE
)
head(boot.result.rmsd)
## RMSD low_ci high_ci sd
## Group_1-prediction_file_nocomments-1 2.270940 2.090101 2.448806 0.1085032
## Group_1-prediction_file_nocomments-2 2.306305 2.136658 2.485175 0.1057066
## Group_1-prediction_file_nocomments-3 2.449651 2.259246 2.653981 0.1204197
## Group_2-prediction_file_nocomments-1 2.374673 2.184303 2.555388 0.1142511
## Group_2-prediction_file_nocomments-2 2.369393 2.184681 2.547442 0.1120477
## Group_3-prediction_file_nocomments-1 1.886298 1.731956 2.042980 0.0938061
plot.RMSD(boot.result.rmsd, method = "", boot = TRUE)
threshold = 0.6: the cutoff that separates positive from negative values.
# Classification-style evaluation: values are binarised at threshold = 0.6
# before computing ROC/AUC and the confusion-matrix summaries.
result.auc.0.6 <- eval.AUC(real.data =exp.data, pred.data = sub.data,
threshold = 0.6)
# Per-submission AUC plus sensitivity/specificity/accuracy/precision/F1/
# balanced accuracy at the chosen threshold.
head(result.auc.0.6$results)
## AUC.0.6 sensitivity specificity
## Group_1-prediction_file_nocomments-1 0.6070 0.7054 0.4518
## Group_1-prediction_file_nocomments-2 0.5775 0.6977 0.4693
## Group_1-prediction_file_nocomments-3 0.5743 0.7287 0.3991
## Group_2-prediction_file_nocomments-1 0.5811 0.7674 0.3509
## Group_2-prediction_file_nocomments-2 0.5960 0.7674 0.3640
## Group_3-prediction_file_nocomments-1 0.6335 0.7674 0.4079
## accuracy precision f1_score bAccu
## Group_1-prediction_file_nocomments-1 0.5434 0.4213 0.5275 0.5786
## Group_1-prediction_file_nocomments-2 0.5518 0.4265 0.5294 0.5835
## Group_1-prediction_file_nocomments-3 0.5182 0.4069 0.5222 0.5639
## Group_2-prediction_file_nocomments-1 0.5014 0.4008 0.5266 0.5592
## Group_2-prediction_file_nocomments-2 0.5098 0.4057 0.5308 0.5657
## Group_3-prediction_file_nocomments-1 0.5378 0.4231 0.5455 0.5877
# Visualise the AUC results for all submissions.
plot.AUC(result.auc.0.6)
# Between-submission Spearman correlation, across all submission files.
result.bM.spearman <- eval.Correlation.Between(
  real.data = exp.data,
  pred.data = sub.data,
  method = "spearman",
  sd.use = NA,
  z.transform = TRUE
)
plot.Correlation.Between(result.bM.spearman$coefficient, method = "spearman")
# Same comparison restricted to the best submission of each group.
result.bM.pearson <- eval.Correlation.Between(
  real.data = exp.data,
  pred.data = sub.data,
  method = "pearson",
  sd.use = NA,
  z.transform = TRUE,
  grouped = TRUE
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
plot.Correlation.Between(result.bM.pearson$coefficient, method = "pearson")
# Partial (Spearman) correlation between submissions, controlling for the
# experimental values; loads ppcor/MASS on first use (messages below).
result.pCor <- eval.Partial.Correlation(real.data = exp.data, pred.data = sub.data, method = "spearman")
## Loading required package: ppcor
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# NOTE(review): this plots result.bM.spearman (the plain between-submission
# correlations from earlier), not the result.pCor computed just above —
# confirm whether the partial-correlation matrix was meant to be plotted here.
plot.Correlation.Between(result.bM.spearman$coefficient, method="Spearman")
# Combine the experimental values with every submission's predictions as
# columns of one matrix (experimental column named "real"). Use <- for
# top-level assignment per R convention.
total <- cbind(real = exp.data$value, sub.data$value)
# Drop rows with missing values before projecting; labels = FALSE suppresses
# point labels (F is a reassignable alias, so spell out FALSE).
Plot.PCA(na.omit(total), labels = FALSE, legend = TRUE)
Uniqueness is the adjusted R-squared difference between the full linear model and the linear model with a given group left out. use.ci = TRUE: error bars show the confidence interval; use.ci = FALSE: error bars show the standard deviation.
# Bootstrap the uniqueness estimate by resampling rows with replacement.
# Use <- for assignment and full TRUE/FALSE literals (T/F are reassignable).
result.uniq <- eval.uniqueness(real.data = exp.data, pred.data = sub.data, boot = TRUE)
result.uniq
## uniqueness low_ci high_ci
## Group_1-prediction_file_nocomments-1 0.0001689140 -0.002436104 0.007191532
## Group_2-prediction_file_nocomments-1 0.0006190618 -0.002458364 0.009810393
## Group_3-prediction_file_nocomments-1 0.0046221322 -0.002334226 0.018613228
## Group_4-prediction_file_nocomments-1 0.0017466180 -0.002418622 0.010884977
## Group_5-prediction_file_nocomments-1 0.0002317603 -0.006121336 0.010123986
## Group_6-prediction_file_nocomments-1 0.0018855771 -0.002440361 0.014276658
## Group_7-prediction_file_nocomments-1 0.0318935936 0.009125475 0.058443523
## sd
## Group_1-prediction_file_nocomments-1 0.003363947
## Group_2-prediction_file_nocomments-1 0.004297741
## Group_3-prediction_file_nocomments-1 0.007274703
## Group_4-prediction_file_nocomments-1 0.004879735
## Group_5-prediction_file_nocomments-1 0.005230960
## Group_6-prediction_file_nocomments-1 0.005725523
## Group_7-prediction_file_nocomments-1 0.015416475
# Error bars as bootstrap confidence intervals.
plot.uniqueness(result.uniq, method = "", boot = TRUE, use.ci = TRUE)
# Error bars as bootstrap standard deviations.
plot.uniqueness(result.uniq, method = "", boot = TRUE, use.ci = FALSE)
# Bootstrap uniqueness by generating values from the experimental
# distribution (boot.var = TRUE) rather than resampling rows.
# Use <- for assignment and full TRUE/FALSE literals.
result.bootvar.uniq <- eval.uniqueness(real.data = exp.data, pred.data = sub.data,
                                       boot = TRUE, boot.var = TRUE)
result.bootvar.uniq
## uniqueness low_ci
## Group_1-prediction_file_nocomments-1 -0.0018969667 -0.0024850231
## Group_2-prediction_file_nocomments-1 -0.0016130751 -0.0024285225
## Group_3-prediction_file_nocomments-1 0.0020983826 -0.0008795615
## Group_4-prediction_file_nocomments-1 -0.0003706045 -0.0019489749
## Group_5-prediction_file_nocomments-1 -0.0023593861 -0.0038966287
## Group_6-prediction_file_nocomments-1 -0.0007913476 -0.0021447765
## Group_7-prediction_file_nocomments-1 0.0292259365 0.0235040612
## high_ci sd
## Group_1-prediction_file_nocomments-1 -0.0007528102 0.0005996154
## Group_2-prediction_file_nocomments-1 -0.0003943794 0.0006703160
## Group_3-prediction_file_nocomments-1 0.0064608075 0.0021915123
## Group_4-prediction_file_nocomments-1 0.0018032103 0.0011808524
## Group_5-prediction_file_nocomments-1 -0.0003875242 0.0010427794
## Group_6-prediction_file_nocomments-1 0.0008691634 0.0009654445
## Group_7-prediction_file_nocomments-1 0.0355077126 0.0036533292
# Error bars as bootstrap standard deviations (use.ci = FALSE).
plot.uniqueness(result.bootvar.uniq, method = "", boot = TRUE, use.ci = FALSE)
# Per-position correctness: threshold = 0.5 separates positive from negative
# calls; lower.positive = FALSE means higher values are the positive class;
# no sd filtering (sd.use = NA) and no z-transform.
# Use <- for assignment and full FALSE literals (F is a reassignable alias).
# NOTE(review): "5nn3" is passed positionally — presumably a protein/structure
# identifier; confirm the matching parameter name in eval.correctness.
res <- eval.correctness(real.data = exp.data, pred.data = sub.data, threshold = 0.5,
                        sd.use = NA, lower.positive = FALSE, z.transform = FALSE,
                        "5nn3")
head(res)
## # A tibble: 6 x 2
## res avg.correctness
## <chr> <dbl>
## 1 4 0.115
## 2 6 0.885
## 3 8 0.5
## 4 11 0.359
## 5 12 0.635
## 6 15 0.962
Then load the cagi_plot.py module into PyMOL and call the plot_cagi function.
PlotCorrectness(prot_name, path): this function outputs a .pse file for the protein. Red: correctness close to 100%; green: correctness towards 0%.